This chapter summarizes best practices for developing with Ollama, to help you write better code.
Streaming output gives a better user experience:
import ollama

stream = ollama.chat(
    model='llama3.2',
    messages=[{'role': 'user', 'content': 'Write a poem'}],
    stream=True
)

# Iterate over the stream and print tokens as they arrive
for chunk in stream:
    if chunk['message']['content']:
        print(chunk['message']['content'], end='', flush=True)
Set a timeout so requests never wait indefinitely:
import requests

messages = [{"role": "user", "content": "Hello"}]

response = requests.post(
    "http://localhost:11434/api/chat",
    json={"model": "llama3.2", "messages": messages},
    timeout=120  # 2-minute timeout
)
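If you use the official Python client rather than raw HTTP, a timeout can also be set when the client is constructed. A minimal sketch, assuming the client forwards standard httpx keyword arguments such as timeout:

import ollama

# Extra keyword arguments (e.g. timeout) are passed to the underlying HTTP client.
client = ollama.Client(host='http://localhost:11434', timeout=120)
response = client.chat(
    model='llama3.2',
    messages=[{'role': 'user', 'content': 'Hello'}]
)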
Handle errors gracefully:
import ollama

def chat_with_handling(prompt):
    try:
        response = ollama.chat(
            model='llama3.2',
            messages=[{'role': 'user', 'content': prompt}]
        )
        return response['message']['content']
    except ollama.ResponseError as e:
        print(f"API error: {e}")
    except ollama.RequestError as e:
        print(f"Request error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")
Pick a model that fits the task:

| Task | Recommended model |
|---|---|
| General chat | llama3.2 |
| Code generation | codellama |
| Multimodal | llava |
| Embeddings | nomic-embed-text |
| Fast responses | llama3.2:1b |
| Higher quality | mistral:7b |
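One way to put the table to work is a small lookup; the task labels below are illustrative and not defined by Ollama itself:

# Illustrative mapping from task type to model, following the table above.
TASK_MODELS = {
    'chat': 'llama3.2',
    'code': 'codellama',
    'vision': 'llava',
    'embedding': 'nomic-embed-text',
    'fast': 'llama3.2:1b',
    'quality': 'mistral:7b',
}

def model_for(task):
    # Fall back to the general-purpose model for unknown task types.
    return TASK_MODELS.get(task, 'llama3.2')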
You can also select the model dynamically based on available memory:

import psutil

def select_model():
    # Choose a model size that fits the machine's total RAM
    memory_gb = psutil.virtual_memory().total / (1024 ** 3)
    if memory_gb < 8:
        return 'llama3.2:1b'
    elif memory_gb < 16:
        return 'llama3.2:3b'
    else:
        return 'llama3.2'

model = select_model()
Match the temperature to the task:

# Creative writing
temperature = 0.8
# Code generation
temperature = 0.2
# Translation
temperature = 0.1
# Q&A
temperature = 0.3
And size the context window (num_ctx) to the workload:

# Simple chat
num_ctx = 2048
# Document analysis
num_ctx = 8192
# Long-document processing
num_ctx = 16384
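Both settings take effect through the options parameter of a request. A minimal sketch for a document-analysis call (the prompt and exact values are illustrative):

import ollama

response = ollama.chat(
    model='llama3.2',
    messages=[{'role': 'user', 'content': 'Summarize the following document: ...'}],
    options={'temperature': 0.3, 'num_ctx': 8192}  # sampling and context-window settings
)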
A good system prompt spells out the role, responsibilities, and expected output format:

system_prompt = """You are a professional technical documentation writer.

Your responsibilities:
1. Write clear, accurate technical documentation
2. Use concise language
3. Provide code examples
4. Explain key concepts

Document format:
- Use Markdown
- Include code blocks
- Add comments where necessary"""
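A minimal sketch of using this prompt as the system message of a chat call (the user request is illustrative):

import ollama

response = ollama.chat(
    model='llama3.2',
    messages=[
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': 'Write a quick-start guide for our REST API'}
    ]
)
print(response['message']['content'])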
Setting format='json' tells Ollama to return the response as JSON:

import json
import ollama

response = ollama.generate(
    model='llama3.2',
    prompt='Generate a user profile',
    format='json'
)

user = json.loads(response['response'])
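If the response could be truncated or otherwise malformed, it is worth guarding the parse step; a small sketch (the fallback is just one possible choice):

try:
    user = json.loads(response['response'])
except json.JSONDecodeError:
    # Retry the request or fall back to a default when parsing fails.
    user = None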
Few-shot examples in the message history steer the model toward consistent, short answers:

response = ollama.chat(
    model='llama3.2',
    messages=[
        {'role': 'system', 'content': 'You are a sentiment analysis assistant'},
        {'role': 'user', 'content': 'This product is amazing!'},
        {'role': 'assistant', 'content': 'Positive'},
        {'role': 'user', 'content': 'The service was terrible'},
        {'role': 'assistant', 'content': 'Negative'},
        {'role': 'user', 'content': "It's okay, I guess"}
    ]
)
keep_alive controls how long the model stays loaded after a request; set it to '0' to free memory as soon as a one-off batch job finishes:

import ollama

def process_batch(texts):
    results = []
    for text in texts:
        response = ollama.chat(
            model='llama3.2',
            messages=[{'role': 'user', 'content': text}],
            keep_alive='0'  # Unload the model as soon as the request completes
        )
        results.append(response['message']['content'])
    return results
For higher throughput, send requests concurrently with a thread pool:

import ollama
from concurrent.futures import ThreadPoolExecutor

def batch_process(items, max_workers=3):
    def process(item):
        return ollama.chat(
            model='llama3.2',
            messages=[{'role': 'user', 'content': item}]
        )
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(process, items))
    return results
Validate user input before it reaches the model:

import re
import ollama

def validate_input(user_input):
    # Reject empty or excessively long input
    if not user_input or len(user_input) > 10000:
        raise ValueError("Invalid input")
    # Reject obviously malicious patterns
    if re.search(r'<script|javascript:', user_input, re.IGNORECASE):
        raise ValueError("Potentially malicious input detected")
    return user_input

def safe_chat(user_input):
    validated = validate_input(user_input)
    return ollama.chat(
        model='llama3.2',
        messages=[{'role': 'user', 'content': validated}]
    )
And filter sensitive values out of model output before it is displayed or logged:

def filter_output(text):
    # Redact anything that looks like a credential
    patterns = [
        r'password["\s:]+\S+',
        r'api[_-]?key["\s:]+\S+',
        r'token["\s:]+\S+'
    ]
    filtered = text
    for pattern in patterns:
        filtered = re.sub(pattern, '[REDACTED]', filtered, flags=re.IGNORECASE)
    return filtered
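The two helpers combine naturally into a single guarded call; a small sketch using the functions defined above (guarded_chat is just an illustrative name):

def guarded_chat(user_input):
    # Validate on the way in, redact on the way out.
    response = safe_chat(user_input)
    return filter_output(response['message']['content'])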
Log request size and latency so slow calls are easy to spot:

import time
import logging
import ollama

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('ollama')

def logged_chat(messages):
    logger.info(f"Sending request: {len(messages)} messages")
    start = time.time()
    response = ollama.chat(
        model='llama3.2',
        messages=messages
    )
    duration = time.time() - start
    logger.info(f"Response received in {duration:.2f}s")
    return response
Mock ollama.chat in unit tests so they run without a live server:

from unittest.mock import patch

def test_chat():
    with patch('ollama.chat') as mock_chat:
        mock_chat.return_value = {
            'message': {'content': 'test reply'}
        }
        # chat_function is your application's own wrapper around ollama.chat
        result = chat_function('Hello')
        assert result == 'test reply'
A simple integration test verifies that the Ollama server is reachable and has at least one model installed:

def test_ollama_connection():
    try:
        models = ollama.list()
        assert len(models['models']) > 0
        print("Connection test passed")
    except Exception as e:
        print(f"Connection test failed: {e}")